# Assign a value and demonstrate a simple if/else branch
x <- 3
print(x)
if (x < 5) {
  print("The value of x is less than 5")
} else {
  print("The value of x is greater than or equal to 5")
}
# add5: return x + 5 for numeric input.
# For non-numeric input, prints a message and returns 0 (guard clause).
add5 <- function(x) {
  if (!is.numeric(x)) {
    print("Argument x is not a number")
    return(0)
  }
  x + 5
}
# Fixed: the message says "42 plus five" but the original call passed 52;
# the argument now matches the printed text.
print(paste0("42 plus five is equal to: ", add5(42)))
print(add5("sdafgdsfg"))
# Demonstrate a for loop: accumulate the sum of 1 through 10
# (equivalent to sum(1:10)).
# Fixed: use "<-" for assignment inside the loop, consistent with the
# rest of the file (tidyverse style: never "=" for assignment).
running_sum <- 0
for (i in 1:10) {
  running_sum <- running_sum + i
}
print(running_sum)
# A sequence of integers from 1 to 10:
1:10
# Construct a small example data frame with two numeric columns
df <- data.frame(variable1 = c(5, 7, 9), variable2 = c(2, 4, 6))
df
# Add a derived column holding the square of variable2
df <- within(df, variable2_squared <- variable2^2)
df
# Path to the sample project folder; adjust for your own machine
projectpath<-"C:/Users/Len/Dropbox/Teaching/Data TA/R tutorial/sample project/"
# Read the QCEW industry-by-year wage data from the raw data subfolder
df <- read.csv(paste0(projectpath,"raw data/qcew_wages_industrybyyear_naics.csv"))
# Show the first five rows of the naics and wage columns only
df[1:5,c('naics','wage')]
# A "comment" in R can be made with the "#" symbol
# Let's ask R to print how many records are in df:
print(nrow(df))   # nrow() is the idiomatic row count (was length(df[,1]))
#First five values of wage only:
df$wage[1:5]
print(paste0("Mean industry wage: ", mean(df$wage)))
print(paste0("Max industry wage: ", max(df$wage)))
#What if I want to display these more pretty-like?
print(paste0("Mean industry wage: $", round(mean(df$wage),2)))
# round(..., -3) rounds to the nearest thousand; divide by 1000 to show "k"
print(paste0("Mean industry wage: $", round(mean(df$wage),-3)/1000, "k"))
df
# We can pick out the rows for coal mining (naics 2121) in 1990 by subsetting
# with the logical vector df$naics==2121 & df$year==1990 :
df[df$naics==2121 & df$year==1990,]
# Careful: "&&" here would apply the AND operation to the first row only!
# Step one: build a data frame containing only the coal industry (naics 2121)
dfcoal <- df[df$naics == 2121, ]
dfcoal
# We use the ggplot plotting library for this, which must be installed. ggplot
# has a formula-based syntax that takes some getting used to.
#install.packages("ggplot2")
library(ggplot2)
#ggplot titles by default show up on the left. I like them in the middle:
theme_update(plot.title = element_text(hjust = 0.5))
# Line-and-point plot of coal wages by year (group=1 treats all rows as one series)
ggplot(data=dfcoal, aes(x=year, y=wage, group=1)) +
geom_line()+
geom_point()+
labs(title="U.S. Coal Industry Wages Over Time", x="Year", y = "Average nominal wage")
# Save the most recently displayed plot to the figures folder
ggsave(filename = paste0(projectpath,'figures/coal_wagesbyyear.png'), width = 10, height = 5, dpi=500)
# Re-set the project path (same as above) so this section can run standalone
projectpath<-"C:/Users/Len/Dropbox/Teaching/Data TA/R tutorial/sample project/"
# sink() redirects console output to a log file
#split=TRUE allows output to also go to the screen (otherwise you would only see it in the log file)
#append=FALSE tells R to overwrite "mylog.txt" rather than appending to itself, if it already exists
sink(file=paste0(projectpath,'/log files/mylog.txt'),split=TRUE, append=FALSE)
# sink.reset: close every open output sink so printing returns to the console
sink.reset <- function() {
  while (sink.number() > 0) {
    sink(NULL)
  }
}
# You can place the csv file in the raw data/ folder. Or, you can read it
# right off the web (and save a local copy):
df<- read.csv("http://www.columbia.edu/~ltg2111/teaching/wages_bycounty_yr2000.csv")
# Save a local copy of the downloaded data, then read it back from disk
write.csv(df, paste0(projectpath,"raw data/wages_bycounty_yr2000.csv"))
df <- read.csv(paste0(projectpath,"raw data/wages_bycounty_yr2000.csv"))
# Show the first five rows
df[1:5,]
# ggplot can do it all automatically:
#Scatterplot of wage vs employment with regression line
library(ggplot2)
# Scatter of wage vs. employment with an OLS fit line (se = FALSE hides the band)
ggplot(data = df, aes(x = employment,y = wage)) +
geom_point(color='cyan') +
geom_smooth(method = "lm", se = FALSE)
# By adding a loess local smoother, we can assess the linearity of the
# relationship. Here we zoom in on counties with average wage between 20-30k
# and employment below 100k, where the bulk of the data is.
ggplot(data = df, aes(x = employment,y = wage)) +
ylim(20000, 30000) + xlim(0,10^5) +   # zoom in on the bulk of the data
geom_point(color='cyan') +
geom_smooth(method = "lm", se = FALSE) +        # linear (OLS) fit
geom_smooth(method = "loess", color="green")    # local (loess) smoother
# Use the lm command to see the estimated coefficients:
lm(wage~employment,data=df)
#It's usually useful to save the estimated model as an object:
model1<-lm(wage~employment,data=df)
# summary() reports coefficients, (homoskedastic) standard errors, R-squared, etc.
summary(model1)
# The standard errors that lm computes by default assume homoskedasticity,
# which is not reasonable in most social science contexts. Based on the above
# scatterplot, would it be reasonable to assume homoskedastic errors for this
# regression? Doesn't look like it.
# We can compute heteroskedasticity-robust standard errors with the sandwich library:
#install.packages("sandwich")
library(sandwich)
# Heteroskedasticity-robust (White) covariance matrix for model1
cov <- vcovHC(model1, type = "HC")
# Robust standard errors are the square roots of the diagonal entries
model1.robustses <- sqrt(diag(cov))
model1.robustses
# Point estimates are unchanged; only the standard errors differ
model1$coefficients
#If we wanted to suppress the constant:
lm(wage~employment+0,data=df)
#Suppose we wanted to inspect the residuals:
hist(model1$residuals, breaks=50,xlim=c(-25000,25000))
# Download county-level unemployment data, save a local copy, and re-read it
temp<- read.csv("https://raw.githubusercontent.com/plotly/datasets/master/fips-unemp-16.csv")
write.csv(temp, paste0(projectpath,"raw data/fips-unemp-16.csv"))
df2<-read.csv(paste0(projectpath,"raw data/fips-unemp-16.csv"))
# Combine the two datasets with the merge command:
merged_data <- merge(df, df2, by="fips", all.x = TRUE, all.y=FALSE)
# First five rows of the merged data
merged_data[1:5,]
# "by" here can be a list of variables to merge on combinations of, e.g.
# by=c("var1", "var2"). all.x = TRUE, all.y=FALSE makes sure to keep all of
# our original county observations, even if we can't find an unemployment
# number for them. Counties that show up only in the unemployment dataset are
# ignored. Check the merge, e.g. missing values of unemp:
summary(merged_data)
# Show the counties that did not match an unemployment record (unemp is NA)
merged_data[is.na(merged_data$unemp),]
# Run a regression on the merged data (rows with missing unemp
# will be ignored by the lm command):
model2<-lm(wage~employment+unemp,data=merged_data)
summary(model2)
# Again, if we are interested in statistical inference on our coefficients, we should generally use
# standard errors that are valid under heteroskedasticity:
cov <- vcovHC(model2, type = "HC")
model2.robustses <- sqrt(diag(cov))
model2.robustses
head(merged_data)
# The merge carried along the "X.x"/"X.y" columns (row numbers written by
# write.csv, since row.names was not suppressed); drop them by name.
somevars <- names(merged_data) %in% c("X.x", "X.y")
cleaned_data <- merged_data[!somevars]
head(cleaned_data)
#Create a new variable that groups employment into 4 quartiles
# quantile() with defaults returns the five quartile breakpoints;
# unique() guards against duplicate breaks, which cut() rejects
qnt <- quantile(cleaned_data$employment)
cleaned_data$employment_quartile<-cut(cleaned_data$employment,unique(qnt),include.lowest=TRUE)
#Sort the data frame by employment
cleaned_data <- cleaned_data[order(cleaned_data$employment),]
#Re-order first few columns in data frame.
#In this example, make "employment" and "employment_quartile" first and retain ordering of the rest
somecols <- c("employment", "employment_quartile")
# setdiff() keeps the remaining names in their original order; unlike the
# -which(... %in% ...) construct it cannot drop every column when no name matches
cols <- c(somecols, setdiff(names(cleaned_data), somecols))
cleaned_data <- cleaned_data[cols]
#rename the "fips" variable to "county" (first matching column only)
colnames(cleaned_data)[which(colnames(cleaned_data) == 'fips')[[1]]] <- "county"
head(cleaned_data)
# One popular package for fast data manipulation is dplyr; another is
# data.table. Using data.table, we can quickly extract the average employment
# across counties within each quartile of employment:
library(data.table)
# Convert the data frame to a data.table for fast grouped operations
dt = data.table(cleaned_data)
# Mean employment within each employment quartile (keyby also sorts by group)
dt <- dt[, .(mean(employment)), keyby = .(employment_quartile)]
#the column with mean employment ends up with the non-descriptive name "V1", so let's change it
colnames(dt)[[2]]<-"mean_emp_byqt"
dt
#We may want to merge this information back to the original dataset:
cleaned_data <- merge(cleaned_data, dt, by="employment_quartile")
head(cleaned_data)
# Suppose we wanted to print the mean of employment and wage across all
# counties. We could do this with the lapply function, which would allow us to
# loop through any set of variable names and ask for its mean:
nothing<-lapply(c("employment", "wage"), function(thisvar){
print(paste0("Mean of variable ", thisvar, ": ",mean(cleaned_data[,thisvar])))
})
#install.packages("stargazer")
library("stargazer")
# Regression table: model1 with default SEs, model1 with robust SEs, model2
# with robust SEs. se=list(...) overrides the standard errors column by column
# (NULL keeps the default homoskedastic SEs for that column).
stargazer(model1, model1, model2, se=list(NULL, model1.robustses, model2.robustses), column.labels=c("not robust","robust", "robust"), align=TRUE, type="text")
# Same table exported as a LaTeX fragment (float=F omits the table environment)
stargazer(model1, model1, model2, se=list(NULL, model1.robustses, model2.robustses), column.labels=c("not robust","robust", "robust"), align=TRUE, float=F, type="latex", out=paste0(projectpath,"/tables/table1.tex"))
# Encode state as a factor variable (useful for dummies / fixed effects)
df$state.f <- factor(df$state)